Alexandra Cathcart
| Method | KNN(k=5) | LDA | QDA | LR | RF(tuning parameter: mtry=3; ntrees=500) | SVM(e1071 tune: cost=10, gamma=1; caret svmRadial final: sigma=8.691262, C=1) |
|---|---|---|---|---|---|---|
| Accuracy | 99.6% | 98.3% | 99.4% | 99.5% | 99.7% | 99.7% |
| AUC | 99.8% | 98.7% | 99.8% | 99.8% | 99.0% | 99.4% |
| ROC | in tabs | in tabs | in tabs | in tabs | in tabs | in tabs |
| Threshold | .66 | .85 | .40 | .29 | .50 | .52 |
| Sensitivity | 93.1% | 74.0% | 85.1% | 91.0% | 95.2% | 94.0% |
| Specificity | 99.9% | 99.2% | 99.6% | 99.8% | 99.8% | 99.9% |
| FDR | 2.2% | 24.6% | 1.7% | 4.7% | 4.8% | 2.8% |
| Precision | 97.8% | 75.4% | 98.3% | 95.3% | 95.2% | 97.2% |
library(tidyverse)
library(dplyr)
library(caret)
library(class)
library(yardstick)
library(plotly)
library(boot)
library(pROC)
library(glmnet)
library(purrr)
library(gridExtra)
library(randomForest)
library(e1071)
# kernlab is required by caret's "svmRadial" method below; guard the install
# so the script does not re-download the package on every run
if (!requireNamespace("kernlab", quietly = TRUE)) {
  install.packages("kernlab")
}
# Reading in the data
data <- read.csv("HaitiPixels.csv", header = TRUE, sep = ",")
# Binary response: "Yes" when the pixel was classed as a blue tarp
data <- data %>%
  mutate(BlueClass = as.factor(ifelse(Class == "Blue Tarp", "Yes", "No")))
# Check the levels just specified
levels(data$BlueClass)
## [1] "No" "Yes"
# Keep columns 2-5 of the set (Red, Green, Blue, BlueClass); drop Class
data <- data[c(2:5)]
# Row id lets us confirm below that the shuffle actually happened
data <- data %>% mutate(id = row_number())
# Check addition
head(data$id)
## [1] 1 2 3 4 5 6
# Shuffle data to fairly split into test / train.
# Seed added so the shuffle (and hence the train/test split) is reproducible;
# the original run was unseeded, so the ids shown below came from that run.
set.seed(2021)
shuffleddata <- sample_n(data, nrow(data))
# Check that it has been shuffled
head(shuffleddata$id) # different than the first six lines of the csv file
## [1] 19746 61512 18716 7512 42719 8307
# Remove the id column
shuffleddata <- shuffleddata[c(1:4)]
# Split the data into test and train for use in our upcoming models,
# using a 10k subset for faster knn function execution and file freezing issues
samp <- 1:10000
samp2 <- 10001:20000
train <- shuffleddata[samp, ]
test <- shuffleddata[samp2, ]
head(train)
## Red Green Blue BlueClass
## 1 77 71 58 No
## 2 216 241 255 Yes
## 3 66 68 54 No
## 4 113 117 86 No
## 5 255 253 179 No
## 6 83 83 68 No
head(test)
## Red Green Blue BlueClass
## 10001 255 228 186 No
## 10002 169 167 154 No
## 10003 101 97 74 No
## 10004 69 71 52 No
## 10005 77 87 54 No
## 10006 255 255 201 No
# Model training rules for all models: 10-fold CV, keep class probabilities
# and the final resample predictions (needed for threshold tuning below)
train_control <- caret::trainControl(method = "cv", number = 10,
                                     returnResamp = 'all',
                                     classProbs = TRUE,
                                     savePredictions = 'final')
# KNN model: tune k over 1..15 on centered/scaled RGB predictors
system.time({
  knnmod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "knn",
                  preProcess = c("center", "scale"),
                  tuneGrid = expand.grid(k = c(1:15)))
})
## user system elapsed
## 14.576 0.075 14.705
knnmod
## k-Nearest Neighbors
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 9000, 9000, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9978 0.9664027
## 2 0.9970 0.9547513
## 3 0.9978 0.9667094
## 4 0.9974 0.9610710
## 5 0.9974 0.9609979
## 6 0.9974 0.9608520
## 7 0.9975 0.9623564
## 8 0.9972 0.9575442
## 9 0.9975 0.9624421
## 10 0.9970 0.9549201
## 11 0.9974 0.9610211
## 12 0.9973 0.9597111
## 13 0.9974 0.9609377
## 14 0.9970 0.9547721
## 15 0.9971 0.9560582
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.
# Plot KNN accuracy across the tuning grid
plot(knnmod)
# Set prediction, probability, and cv score variables in case needed
knnmod_pred <- predict(knnmod, test, 'raw')
knnmod_prob <- predict(knnmod, test, 'prob')
knnmod_scored <- cbind(test, knnmod_pred, knnmod_prob)
# AUC/ROC: "Yes" is the second factor level, so pass event_level = "second"
# directly to the metric functions. (The global
# options(yardstick.event_first = FALSE) is deprecated as of yardstick 0.0.7.)
knn_auc <- knnmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
knn_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.994
# ROC curve + plot
ROC_curve <- knnmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot <- ROC_curve %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle('KNN ROC curve')
ggplotly(ROC_curve_plot)
# Set threshold on the cross-validated predictions
knnmod_pred2 <- knnmod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .66, 99.7% best.
  mutate(prediction = ifelse(Yes > .66, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# Confusion matrix
confusionMatrix(knnmod_pred2$prediction, knnmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9646 8
## Yes 14 332
##
## Accuracy : 0.9978
## 95% CI : (0.9967, 0.9986)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9668
##
## Mcnemar's Test P-Value : 0.2864
##
## Sensitivity : 0.9765
## Specificity : 0.9986
## Pos Pred Value : 0.9595
## Neg Pred Value : 0.9992
## Prevalence : 0.0340
## Detection Rate : 0.0332
## Detection Prevalence : 0.0346
## Balanced Accuracy : 0.9875
##
## 'Positive' Class : Yes
##
# LDA model
# NOTE: family = "binomial" was removed from the original call — it is a
# glm() argument and is silently swallowed by MASS::lda's `...`.
system.time({
  ldamod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "lda",
                  preProcess = c("center", "scale"))
})
## user system elapsed
## 0.722 0.006 0.734
ldamod
## Linear Discriminant Analysis
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 9000, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9831 0.7540222
# Set prediction, probability, and cv score variables in case needed
ldamod_pred <- predict(ldamod, test, 'raw')
ldamod_prob <- predict(ldamod, test, 'prob')
ldamod_scored <- cbind(test, ldamod_pred, ldamod_prob)
# AUC/ROC: event_level = "second" marks "Yes" (second factor level) as the
# event, replacing the deprecated options(yardstick.event_first = FALSE)
lda_auc <- ldamod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
lda_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.989
# ROC curve + plot
ROC_curve2 <- ldamod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot2 <- ROC_curve2 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle('LDA ROC curve')
ggplotly(ROC_curve_plot2)
# Set new threshold
ldamod_pred2 <- ldamod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .85, 98.6% best.
  mutate(prediction = ifelse(Yes > .85, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(ldamod_pred2$prediction, ldamod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9587 79
## Yes 73 261
##
## Accuracy : 0.9848
## 95% CI : (0.9822, 0.9871)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7666
##
## Mcnemar's Test P-Value : 0.6851
##
## Sensitivity : 0.7676
## Specificity : 0.9924
## Pos Pred Value : 0.7814
## Neg Pred Value : 0.9918
## Prevalence : 0.0340
## Detection Rate : 0.0261
## Detection Prevalence : 0.0334
## Balanced Accuracy : 0.8800
##
## 'Positive' Class : Yes
##
# QDA model
# NOTE: family = "binomial" was removed from the original call — it is a
# glm() argument and is silently swallowed by MASS::qda's `...`.
system.time({
  qdamod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "qda",
                  preProcess = c("center", "scale"))
})
## user system elapsed
## 0.707 0.007 0.718
qdamod
## Quadratic Discriminant Analysis
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 9000, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9946 0.9101972
# Set prediction, probability, and cv score variables in case needed
qdamod_pred <- predict(qdamod, test, 'raw')
qdamod_prob <- predict(qdamod, test, 'prob')
qdamod_scored <- cbind(test, qdamod_pred, qdamod_prob)
# AUC/ROC: event_level = "second" marks "Yes" (second factor level) as the
# event, replacing the deprecated options(yardstick.event_first = FALSE)
qda_auc <- qdamod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
qda_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.999
# ROC curve + plot
ROC_curve3 <- qdamod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot3 <- ROC_curve3 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste error: this is the QDA curve, not LDA
  ggtitle('QDA ROC curve')
ggplotly(ROC_curve_plot3)
# Set new threshold
qdamod_pred2 <- qdamod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .40, 99.5% best.
  mutate(prediction = ifelse(Yes > .40, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(qdamod_pred2$prediction, qdamod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9658 52
## Yes 2 288
##
## Accuracy : 0.9946
## 95% CI : (0.993, 0.9959)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9115
##
## Mcnemar's Test P-Value : 2.592e-11
##
## Sensitivity : 0.8471
## Specificity : 0.9998
## Pos Pred Value : 0.9931
## Neg Pred Value : 0.9946
## Prevalence : 0.0340
## Detection Rate : 0.0288
## Detection Prevalence : 0.0290
## Balanced Accuracy : 0.9234
##
## 'Positive' Class : Yes
##
# GLM (logistic regression) model — family = "binomial" is correct here
system.time({
  glmmod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "glm",
                  preProcess = c("center", "scale"), family = "binomial")
})
## user system elapsed
## 1.264 0.032 1.303
glmmod
## Generalized Linear Model
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 9000, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9956 0.9298505
# Set prediction, probability, and cv score variables in case needed
glmmod_pred <- predict(glmmod, test, 'raw')
glmmod_prob <- predict(glmmod, test, 'prob')
glmmod_scored <- cbind(test, glmmod_pred, glmmod_prob)
# AUC/ROC: event_level = "second" marks "Yes" (second factor level) as the
# event, replacing the deprecated options(yardstick.event_first = FALSE).
# Fixed: the original stored this in qda_auc, clobbering the QDA result.
glm_auc <- glmmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
glm_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.998
# ROC curve + plot (renamed from ROC_curve4, which the RF and SVM sections
# below also reused, silently overwriting this model's curve objects)
ROC_curve_glm <- glmmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot_glm <- ROC_curve_glm %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste error: this is the logistic regression curve, not LDA
  ggtitle('Logistic Regression ROC curve')
ggplotly(ROC_curve_plot_glm)
# Set new threshold
glmmod_pred2 <- glmmod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .29, 99.6% best.
  mutate(prediction = ifelse(Yes > .29, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(glmmod_pred2$prediction, glmmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9646 30
## Yes 14 310
##
## Accuracy : 0.9956
## 95% CI : (0.9941, 0.9968)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9315
##
## Mcnemar's Test P-Value : 0.02374
##
## Sensitivity : 0.9118
## Specificity : 0.9986
## Pos Pred Value : 0.9568
## Neg Pred Value : 0.9969
## Prevalence : 0.0340
## Detection Rate : 0.0310
## Detection Prevalence : 0.0324
## Balanced Accuracy : 0.9552
##
## 'Positive' Class : Yes
##
# Random Forest model
# Choosing tuning parameters:
# https://discuss.analyticsvidhya.com/t/how-to-decide-no-of-ntrees-in-randomforest/6882/3
# https://rpubs.com/phamdinhkhanh/389752
# (train_control is already defined above with these exact settings; the
# original re-created it here, which is redundant but harmless — kept so this
# section also runs standalone)
train_control <- caret::trainControl(method = "cv", number = 10,
                                     returnResamp = 'all',
                                     classProbs = TRUE,
                                     savePredictions = 'final')
# https://stackoverflow.com/questions/10085806/extracting-specific-columns-from-a-data-frame
df <- train %>%
  select(Red, Green, Blue)
# mtryStart defaults at sqrt(p); with only 3 predictors the valid mtry range
# is 1..3, so the original mtry = 5 was out of range (randomForest warned and
# reset it — see output below). Start at the maximum valid value instead.
(tuneRF(df, train$BlueClass, mtry = 3, ntree = 500, stepFactor = 5,
        improve = 0.05, trace = TRUE, plot = TRUE, doBest = TRUE))
## mtry = 5 OOB error = 0.28%
## Searching left ...
## mtry = 1 OOB error = 0.27%
## 0.03571429 0.05
## Searching right ...
## mtry = 3 OOB error = 0.28%
## 0 0.05
##
## Call:
## randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 0.27%
## Confusion matrix:
## No Yes class.error
## No 9651 9 0.000931677
## Yes 18 322 0.052941176
# mtry = 3
##------
tunegrid <- expand.grid(.mtry = 3)
modellist <- list()
# Train with different ntree parameters and inspect bias/variance tradeoff
#findtrees1 <- train(BlueClass~Red+Green+Blue,
# data=train,
# method = 'rf',
# metric = 'Accuracy',
# tuneGrid = tunegrid,
# trControl = control,
# ntree = 50)
#findtrees1
#findtrees2 <- train(BlueClass~Red+Green+Blue,
# data=train,
# method = 'rf',
# metric = 'Accuracy',
# tuneGrid = tunegrid,
# trControl = control,
# ntree = 100)
#findtrees2
system.time({
  RF <- train(BlueClass ~ Red + Green + Blue,
              data = train,
              method = 'rf',
              metric = 'Accuracy',
              tuneGrid = tunegrid,
              trControl = train_control,
              ntree = 500)
})
## user system elapsed
## 11.778 1.213 13.040
RF
## Random Forest
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 9000, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.997 0.9544314
##
## Tuning parameter 'mtry' was held constant at a value of 3
# Set prediction, probability, and cv score variables in case needed
rfmod_pred <- predict(RF, test, 'raw')
rfmod_prob <- predict(RF, test, 'prob')
rfmod_scored <- cbind(test, rfmod_pred, rfmod_prob)
# AUC/ROC: event_level = "second" marks "Yes" (second factor level) as the
# event, replacing the deprecated options(yardstick.event_first = FALSE)
rf_auc <- rfmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
rf_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.991
# ROC curve + plot (renamed from ROC_curve4, which overwrote the GLM curve)
ROC_curve_rf <- rfmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot_rf <- ROC_curve_rf %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste error: this is the random forest curve, not LDA
  ggtitle('Random Forest ROC curve')
ggplotly(ROC_curve_plot_rf)
# Set new threshold
rfmod_pred2 <- RF$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .50, 99.7% best.
  mutate(prediction = ifelse(Yes > .50, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(rfmod_pred2$prediction, rfmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9644 14
## Yes 16 326
##
## Accuracy : 0.997
## 95% CI : (0.9957, 0.998)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9545
##
## Mcnemar's Test P-Value : 0.8551
##
## Sensitivity : 0.9588
## Specificity : 0.9983
## Pos Pred Value : 0.9532
## Neg Pred Value : 0.9986
## Prevalence : 0.0340
## Detection Rate : 0.0326
## Detection Prevalence : 0.0342
## Balanced Accuracy : 0.9786
##
## 'Positive' Class : Yes
##
# SVM — choosing tuning parameters via e1071::tune
# Linear
set.seed(1)
tune.out.linear <- tune(svm, BlueClass ~ Red + Green + Blue, data = train,
                        kernel = "linear",
                        ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
# Radial
tune.out.radial <- tune(svm, BlueClass ~ Red + Green + Blue, data = train,
                        kernel = "radial",
                        ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                      gamma = c(0.5, 1, 2, 3, 4)))
# lowest error is radial cost 10 gamma 1
# Poly
tune.out.poly <- tune(svm, BlueClass ~ Red + Green + Blue, data = train,
                      kernel = "polynomial",
                      ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                    degree = c(1, 2, 3, 4, 5)))
##------
system.time({
  # NOTE(review): cost/gamma are e1071::svm arguments; caret's 'svmRadial'
  # wraps kernlab::ksvm, which is tuned via sigma/C (hence the sigma/C values
  # reported in the summary table). These two arguments are most likely
  # ignored here — confirm, or supply tuneGrid = expand.grid(sigma=, C=).
  svmmod <- train(BlueClass ~ Red + Green + Blue,
                  data = train,
                  method = 'svmRadial',
                  metric = 'Accuracy',
                  trControl = train_control,
                  cost = 10,
                  gamma = 1,
                  preProcess = c("center", "scale")
  )
})
## user system elapsed
## 15.985 0.124 16.179
# Set prediction, probability, and cv score variables in case needed
svmmod_pred <- predict(svmmod, test, 'raw')
svmmod_prob <- predict(svmmod, test, 'prob')
svmmod_scored <- cbind(test, svmmod_pred, svmmod_prob)
# AUC/ROC: event_level = "second" marks "Yes" (second factor level) as the
# event, replacing the deprecated options(yardstick.event_first = FALSE)
svmmod_auc <- svmmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
svmmod_auc
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.999
# ROC curve + plot (renamed from ROC_curve4, which overwrote earlier curves)
ROC_curve_svm <- svmmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot_svm <- ROC_curve_svm %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste error: this is the SVM curve, not LDA
  ggtitle('SVM ROC curve')
ggplotly(ROC_curve_plot_svm)
# Set new threshold
svmmod_pred2 <- svmmod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .52, 99.7% best.
  mutate(prediction = ifelse(Yes > .52, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(svmmod_pred2$prediction, svmmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9649 15
## Yes 11 325
##
## Accuracy : 0.9974
## 95% CI : (0.9962, 0.9983)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9602
##
## Mcnemar's Test P-Value : 0.5563
##
## Sensitivity : 0.9559
## Specificity : 0.9989
## Pos Pred Value : 0.9673
## Neg Pred Value : 0.9984
## Prevalence : 0.0340
## Detection Rate : 0.0325
## Detection Prevalence : 0.0336
## Balanced Accuracy : 0.9774
##
## 'Positive' Class : Yes
##